import scrapy
import re
from gallery.items import GalleryItem

class GallerSpider(scrapy.Spider):
    """Scrape the gallery ("hall") listing of m.huajia.cc and its detail pages.

    Flow: ``parse`` requests ``hall.php``; ``parse_title`` walks each listing
    page, schedules one detail request per entry and then requests the next
    listing page; ``parse_detail`` yields a ``GalleryItem`` with the fields
    ``srname``, ``srimgs``, ``fileurl``, ``contact`` and ``introduction``.
    """

    name = 'galler'
    allowed_domains = ['m.huajia.cc']
    start_urls = ['http://m.huajia.cc/']
    # Number of the listing page being processed; incremented after each
    # listing page to build the next ``hall.php?page=N`` URL.
    nums = 1
    # Highest listing page number that will be requested.
    max_pages = 15

    # First single-quoted substring of an ``onclick`` attribute; the spider
    # treats it as the (site-relative) URL of the detail page.
    _ONCLICK_URL_RE = re.compile(r"'(.*?)'")
    # Text that separates the contact part from the introduction part.
    _INTRO_MARKER = '简介：'
    # Deletes CR, TAB, LF, no-break space and ideographic space in one pass.
    _STRIP_TABLE = str.maketrans('', '', '\r\t\n\xa0\u3000')

    def parse(self, response):
        """Entry point for the site home page: request the first listing page."""
        yield scrapy.Request(
            self.start_urls[0] + 'hall.php',
            callback=self.parse_title,
        )

    def parse_title(self, response):
        """Parse one listing page.

        Yields a detail-page request for every entry whose ``onclick``
        attribute contains a quoted URL, followed by a request for the next
        listing page while ``self.nums`` has not exceeded ``max_pages``.
        Entries without a usable ``onclick`` are logged and skipped so that a
        single malformed entry cannot abort the page or stop pagination.
        """
        for div in response.xpath('/html/body/div/div[2]/div[2]/div'):
            srimg = div.xpath('.//div[1]/img//@src').extract_first()
            if srimg is not None:
                # Entries with a cover image keep name/link in the third div.
                info_index = 3
            else:
                # Without an image the name/link block is the second div.
                srimg = ''
                info_index = 2

            info_xpath = './/div[%d]/div' % info_index
            srname = div.xpath(info_xpath + '/span[1]//text()').extract_first()
            onclick = div.xpath(info_xpath + '//@onclick').extract_first()

            match = self._ONCLICK_URL_RE.search(onclick) if onclick else None
            if match is None:
                self.logger.warning(
                    'Skipping entry %r on %s: no quoted URL in onclick=%r',
                    srname, response.url, onclick,
                )
                continue

            fileurl = self.start_urls[0] + match.group(1)
            yield scrapy.Request(
                fileurl,
                callback=self.parse_detail,
                meta={'img': srimg, 'name': srname, 'fileurl': fileurl},
            )

        # Pagination: move on to the next listing page until the limit is hit.
        self.nums = self.nums + 1
        if self.nums > self.max_pages:
            return
        yield scrapy.Request(
            self.start_urls[0] + 'hall.php?page=' + str(self.nums),
            callback=self.parse_title,
        )

    def parse_detail(self, response):
        """Parse a detail page and yield the populated ``GalleryItem``.

        Name, image URL and page URL are taken from ``response.meta``. The
        text of the detail container is stripped of the characters in
        ``_STRIP_TABLE`` and split at the first ``_INTRO_MARKER``: the part
        before it becomes ``contact``, everything after it ``introduction``.
        When the marker is absent, ``contact`` holds the whole text and
        ``introduction`` is an empty string.
        """
        item = GalleryItem()
        item['srname'] = response.meta['name']
        item['srimgs'] = response.meta['img']
        item['fileurl'] = response.meta['fileurl']

        container = response.xpath('/html/body/div/div[2]/div[2]/div[1]')
        text = ''.join(container.xpath('string(.)').extract())
        text = text.translate(self._STRIP_TABLE)

        # partition never raises, unlike indexing the result of split().
        contact, _, introduction = text.partition(self._INTRO_MARKER)
        item['contact'] = contact
        item['introduction'] = introduction

        yield item